import random
import re
import urllib
import urllib.parse  # explicit: `import urllib` alone does not guarantee the submodule

import requests
from lxml import html

import readua

# Pool of User-Agent strings loaded once at import time; get_html() picks
# one at random per request. 'UA.txt' is resolved against the current
# working directory, so run the script from the directory containing it.
uas = readua.read_ua('UA.txt')

def str_union(s1, s2):
    """Concatenate *s1* and *s2*, collapsing the longest overlap where a
    suffix of s1 equals a prefix of s2.

    Examples:
        str_union("aba", "abc") -> "ababc"   (overlap "a")
        str_union("abc", "def") -> "abcdef"  (no overlap)

    Bug fixed: the previous version removed the overlap with
    s1.replace(overlap, ""), which deleted *every* occurrence of the
    overlap anywhere in s1 (so str_union("aba", "abc") produced "babc").
    Only the prefix of s2 duplicated by s1's suffix must be dropped.
    """
    # Try the longest possible overlap first; the first match wins.
    for k in range(min(len(s1), len(s2)), 0, -1):
        if s1[-k:] == s2[:k]:
            return s1 + s2[k:]
    # No suffix/prefix overlap at all: plain concatenation.
    return s1 + s2


def get_html(url):
    """Download *url* and return its body text, or '' on any HTTP failure.

    A random User-Agent from the module-level `uas` pool is sent with each
    request, presumably to reduce the chance of being blocked by the site.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response body as text, or the empty string when the request
        fails or the server answers with an error status.
    """
    headers = {
        'User-Agent': random.choice(uas)
    }
    try:
        # timeout keeps a dead/slow server from hanging the scraper forever
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        return r.text
    # Catch only requests' own errors: the old bare `except:` also swallowed
    # KeyboardInterrupt/SystemExit and hid genuine programming errors.
    except requests.RequestException:
        return ''

def get_books(surl):
    """Scrape the page at *surl* for book links.

    Matches anchors whose href looks like "/<digits>/<digits>/" and returns
    a list of dicts with keys 'url' (surl + relative href) and 'title'.
    """
    markup = get_html(surl)
    link_re = re.compile(r'<a href="(/\d+/\d+/)".*?>(.*?)</a>')
    return [
        {'url': surl + href, 'title': name}
        for href, name in link_re.findall(markup)
    ]

def get_chapters(burl, book_name):
    """Fetch a book's chapter-index page and return its chapter list.

    Args:
        burl: Absolute URL of the book's chapter-index page.
        book_name: Human-readable book title, copied into every entry.

    Returns:
        List of dicts with keys 'url', 'title', 'book_name'.
    """
    page = get_html(burl)
    hometree = html.fromstring(page)
    # [6:] skips the first six <dd> entries — presumably a "latest chapters"
    # teaser block duplicated at the top of the list; verify against the site.
    titles = hometree.xpath('//div[@class="listmain"]/dl/dd/a/text()')[6:]
    urls = hometree.xpath('//div[@class="listmain"]/dl/dd/a/@href')[6:]
    # zip() stops at the shorter list, so a title/href count mismatch no
    # longer raises IndexError as the old range(len(urls)) indexing could.
    return [
        {
            'url': str_union(burl, href),
            'title': title,
            'book_name': book_name,
        }
        for href, title in zip(urls, titles)
    ]

def search(keyword):
    """Query lewen.cc's search endpoint for books matching *keyword*.

    Args:
        keyword: Search term; URL-encoded with quote_plus before sending.

    Returns:
        List of dicts with keys 'title', 'url', 'author', 'tag', 'new'.
        Empty when the page could not be fetched or nothing matched.
    """
    surl = 'https://www.lewen.cc'
    qurl = f'https://www.lewen.cc/s.php?ie=gbk&q={urllib.parse.quote_plus(keyword)}'
    page = get_html(qurl)
    hometree = html.fromstring(page)
    book_titles = hometree.xpath('//div[@class="bookinfo"]/h4/a/text()')
    book_urls = [surl + href for href in hometree.xpath('//div[@class="bookinfo"]/h4/a/@href')]
    # The site prefixes these fields with Chinese labels ("分类：" = category,
    # "作者：" = author); strip the labels, keep the values.
    book_tags = [tag.replace("分类：", "") for tag in hometree.xpath('//div[@class="bookinfo"]/div[1]/text()')]
    book_authors = [author.replace("作者：", "") for author in hometree.xpath('//div[@class="bookinfo"]/div[2]/text()')]
    book_news = hometree.xpath('//div[@class="bookinfo"]/div[3]/a/text()')
    # zip() stops at the shortest list, so one result missing a field no
    # longer raises IndexError as the old range(len(book_titles)) loop could.
    return [
        {'title': title, 'url': url, 'author': author, 'tag': tag, 'new': new}
        for title, url, author, tag, new in zip(
            book_titles, book_urls, book_authors, book_tags, book_news
        )
    ]
    

if __name__ == '__main__':
    # Manual smoke test: pull one known book's chapter index and print it.
    url = 'https://www.lewen.cc'
    #books = search('盖世')
    #for book in books:
    #    print(book)
    #books = get_books(url)
    #print(books)
    book_url = 'https://www.lewen.cc/94/94468/'
    book_title = '最强道长：开局就是神仙'
    print(get_chapters(book_url, book_title))
